org 100h   ; assume ax=bx=0 ch=0
; DOS .COM image. Nothing below verifies the register state assumed above.

  pop di   ; di=sp=0
           ; (assumes the DOS start state sp=0xfffe with a zero word on the
           ;  stack -- TODO confirm for the target DOS)

;Prepare floating-point constants for SSE
;[0xfff0]=0xffe00000, [0xffe0]=0xffc00000, ... [0x8000]=0
;step 0x00200000: ... 1 1.25 1.5 1.75  2 2.5 3 3.5  4 5 6 7 ...
;Each 16-byte slot is built by four push ax / push bx pairs, i.e. four copies
;of the dword (ax<<16), so a slot can be used directly as a packed operand.
;The stack pointer is the write cursor: it walks down from 0 (wrapping) to
;0x8000, so the upper 32kB of the segment end up holding the table.
PK:
  mov cl,4     ; cx=4 pairs per slot (relies on ch=0)
  sub ax,0x20  ; next high word; ZF is set once ax has wrapped back to 0
PKL:
  push ax  ; x
  push bx  ; 0
  loop PKL ; store four times
  jnz PK   ; loop 2048 times -> 32kB; sp=0x8000
           ; (ZF is still the one from sub: push and loop leave flags alone)

;Video setup
  mov al,13h   ; ax=0 after the loop above, so this makes ax=0013h
  int 10h      ; BIOS video service, ah=0: set mode 13h
  push 0xa000
  pop es       ; es=0xa000; also read back later via "mov bx,es" as the constant 0xa000
  fninit       ; reset the x87 FPU

;;palette test
;SEGMENTS equ 8
;XRUN     equ 6
;YRUN     equ 6
;START    equ 0
;
;%if YRUN<256
;  %define mov__cx_YRUN mov cl,YRUN
;%else
;  %define mov__cx_YRUN mov cx,YRUN
;%endif
;
;%if XRUN<256
;  %define mov__cx_XRUN mov cl,XRUN
;%else
;  %define mov__cx_XRUN mov cx,XRUN
;%endif
;
;  pusha
;  salc
;  scasw
;PW mov__cx_YRUN
;PY mov bx,SEGMENTS
;  pusha
;  add al,START
;PX mov__cx_XRUN
;  rep stosb
;  inc ax
;  dec bx
;  jnz PX
;  popa
;  add di,320
;  loop PY
;%if SEGMENTS<256
;  add al,SEGMENTS
;  jnc PW
;%endif
;  popa

;Palette: 32 Luminance * 8 Hue: diffuse = L*[0.2,H,1], specular = L^9 / 2
;Writes 3 components for every value of the 8-bit counter LLLLLHHH to the
;VGA DAC (index port 3c8h, data port 3c9h). With bx=0 on entry (as assumed
;at program start) the outer loop runs 256 times and leaves bx=0 again.
;NOTE(review): POW and MAD count their passes via the parity of si's low
;byte; they only run 3 times each if that byte is 0 on entry (e.g.
;si=0100h) -- confirm for the target DOS.
  mov dx,3c8h
  xor ax,ax
  out dx,al   ; start writing at palette index 0
  inc dx      ; dx=3c9h
PAL:
  or bx,0b0000011100011111  ; bx = LLLLL... HHH.....
                            ; the filler bits are forced to 1 so that inc bx
                            ; carries straight through them: H, then L

;Color
  push dx    ; b=0.78  (dl=0xc9)
  push bx    ; g=H     (bl=HHH11111)
  push ax    ; r=last blue output (0..0.25)

;Specular
  mov al,bh  ; al=L (bh=LLLLL111)
POW:
  mul al     ; ax=al*al
  mov al,ah  ; al=al^2/256
  inc si
  jpo POW    ; 3 times
  shr ah,1   ; cl=L^8/2 (0..127)
  mov cl,ah

;Diffuse, add with saturation
MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc SAT
  salc       ; clamp to 0..255  (undocumented opcode: al=0xff when CF is set)
SAT:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10  ; al=ah>>2 (0..63), ah=0
  out dx,al
  dec si     ; undoes one inc si from POW per pass
  jpo MAD    ; 3 times

  inc bx     ; next hue; every 8th step carries into luminance
  jnz PAL    ; done when bx wraps to 0

;;palette test
;  xor ax,ax  ; wait for a key
;  int 16h
;  mov ax,3   ; textmode
;  int 10h
;  ret

%define K(x) [0x8000 + 0x10*(x/0x20)]
;K(x): memory operand for the constant-table slot whose four dwords are
;(x<<16), i.e. x is the upper 16 bits of an IEEE single. The slot sits at
;0x8000 + x/2; x must be a multiple of 0x20 (the step used by the startup
;loop that fills the table).
%define K_0_25        K(0x3e80)  ; 0.25
%define K_EPS         K(0x3c20)  ; 0.009765625
%define K_TIME_DELTA  K(0x3c00)  ; 0.0078125
%define K_MAX_STEP    K(0x4100)  ; 8
%define K_LIGHT_SCALE K(0x4540)  ; 3072 = 30/EPS
%define K_HUE_SCALE   K(0x4180)  ; 16 = 8 steps * 2
%define K_NEG_ABS     K(0x8000)  ; -0 = 0x80000000 for -abs()
%define K_MINUS1      K(0xbf80)  ; -1

;Scale factors that turn the integer pixel coordinates (see INT_X/INT_Y)
;into floats; the 2**-32 part undoes the 2^32 scaling of the integers.
;For 16:9 screens: pixel aspect ratio = 1.008
%define K_X_SCALE     K(0x2fe0)  ; 1.75 * 2**-32: x -> ..0.875
%define K_Y_SCALE     K(0x2fa0)  ; 1.25 * 2**-32: y -> ..0.488

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE     K(0x2fc0)  ; 1.5 * 2**-32: x -> ..0.75
;%define K_Y_SCALE     K(0x2fc0)  ; 1.5 * 2**-32: y -> ..0.586

  fldz             ;| t=0
                   ; (the pictures after "|" list the x87 stack, st0 first)

;For each frame: prepare rotation constants
;Angles 1 and 2 are both t, angle 3 is t*log2(e).
M fadd dword K_TIME_DELTA ;| t+=dt
  fld st0          ;| t t
  fsincos          ;| C1 S1 t
  fld st1          ;| S1 C1 S1 t
  fld st1          ;| C2=C1 S2=S1 C1 S1 t
  fldl2e           ;| 1.442695 C2 S2 C1 S1 t
  fmul st5         ;| 1.442695*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t

;Store each constant four times
;Fills ds:0x9fa0..0x9fff with six 16-byte slots (C3 S3 C2 S2 C1 S1) and pops
;each value off the FPU stack once stored, so only t is left for the next frame.
  mov bx,0xa000 - 0x60
STORE:
  mov cl,4         ; cx=4 copies per slot (relies on ch=0)
STORE4:
  fst dword[bx]    ;bx-0x60 0x50 0x40 0x30 0x20 0x10 |0
  add bl,4         ;   C3   S3   C2   S2   C1   S1   |XY
  loop STORE4
  fstp st0         ; drop the stored constant
  jnz STORE        ; loop 6 times: ZF is from the last add bl,4 (loop and fstp
                   ; leave it alone) and is set when bl wraps from 0xfc to 0x00

%define COS [bx-0x60]
%define SIN [bx-0x50]
;With bx=0xa000, 0xa020, 0xa040 these address the C3/S3, C2/S2 and C1/S1
;slots written by the STORE loop.

;For each pixel: store x,y coordinates
;Four consecutive pixels (di..di+3) are processed together, one per SSE lane.
;Each gets the dword di*0xcccd + 0x9b800000 at ds:0xa000 + 4*lane. Since
;0xcccd is about 2^16*256/320, the top 16 bits count rows in 1/256 units
;(offset by 0x9b80, i.e. -100.5 rows as a signed word) and the bits below
;hold the position within the row.
X mov bx,es        ; bx=0xa000
  mov cl,4
X4:
  mov ax,0xcccd
  mul di           ; dx:ax = di*0xcccd
  add dx,0x9b80
  mov [bx],ax
  mov [bx+2],dx
  add bl,4
  inc di
  loop X4      ; di+=4 bx=0xa010

;Both are read in MAP with bx=0xa000. INT_X starts one byte lower, which
;shifts every dword left by 8 bits and so drops the row byte.
%define INT_X [bx-1]  ; x = 2^32 * (-0.5..0.5)
%define INT_Y [bx]    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

%define x xmm0 ; XYZ coordinates for iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [c,c/4,0]
%define d xmm7 ; depth

;Trace 32 steps along a ray
  mov cl,32         ; cx=32: ch is 0 after loop X4, and MAP returns with ch=0
  movaps d,K_MINUS1 ; d=-1
T call MAP
  addps d,a         ; d+=map(X,Y,d)
  loop T

;Normal, ambient occlusion
;Brightness comes from the difference of the distance estimate at two depths.
  call MAP          ; bx=0xa060
  movaps [bx],a     ; keep map(X,Y,d) in scratch memory at ds:0xa060
  subps d,K_EPS
  call MAP          ; a = map(X,Y,d-EPS)
  subps a,[bx]      ; a = map(X,Y,d-EPS) - map(X,Y,d)

;Fog
;  minps Z,K1       ; Z = min(.9-Z,1)
;  mulps a,Z        ; a *= Z

;Color
  mulps a,K_LIGHT_SCALE
  mulps o,K_HUE_SCALE
  cvtps2dq a,a     ; float -> int32 per lane
  cvtps2dq o,o
  pslld a,3        ; L*8
  paddd a,o        ; color index = L(0..31?)*8 + H(0..7)
  packssdw a,a     ; int32 -> int16 with signed saturation
  packuswb a,a     ; clamp to 0..255

;Next pixel
  movd [es:di-4],a ; low dword = the four pixel bytes; di was already advanced by 4
  test di,di
  jnz X            ; the frame is finished when di wraps to 0

;Esc test, next frame
  in al,0x60       ; read the keyboard data port
  dec al           ; zero only for scancode 1
  jnz M   ; fallthrough
          ; NOTE(review): on Esc, execution runs into MAP below and its ret
          ; has no matching call. Presumably it pops the zero word at
          ; [0x8000] (sp=0x8000 after the table setup) and so jumps to
          ; offset 0 of the segment to terminate -- confirm.

MAP:
;Distance estimate for four pixels at once (one per SSE lane).
; In:     d = depth per lane
;         pixel coordinate dwords at ds:0xa000..0xa00f (see INT_X/INT_Y)
;         rotation constants at ds:0x9fa0..0x9fff (see COS/SIN)
;         es = 0xa000
; Out:    a = min(length-2c, MAX_STEP)
;         o = orbit trap: largest length^2 seen over the 19 iterations
;         bx = 0xa060, ch = 0
; Clobb:  x y z b c, bx, ch, flags
  mov bx,es
  movups x,INT_X    ; unaligned load: INT_X is at an odd address
  cvtdq2ps y,INT_Y
  cvtdq2ps x,x
  mulps x,K_X_SCALE ; x: about -0.875..0.875 (1.75 * the +-0.5 range of INT_X)
  mulps y,K_Y_SCALE
  movaps z,d    ; x,y,z = X,Y,depth

  xorps o,o    ; o=0
  movaps c,K_0_25 ; c=K: translation = [R,R/4,0]
  mov ch,19    ; 19 iterations

;Rotate in the XZ, YX and ZY planes
;Each pass rotates the current x,z pair and then cycles the three registers,
;so three passes rotate about all three axes.
L mov bx,es
R movaps b,COS ; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,SIN
  mulps b,z    ; b=Cz
  mulps z,a    ; z=Sz
  mulps a,x    ; a=Sx
  mulps x,COS  ; x=Cx
  subps a,b    ; a=x'=Sx-Cz
  addps z,x    ; z=z'=Sz+Cx
  movaps x,y   ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20  ; 0x00 | 0x20 | 0x40
  jpo R        ; bx=0xa060
               ; (bl=0x20 and 0x40 have odd parity and repeat; 0x60 has even parity and exits)

;Reflect along x and y
  orps x,K_NEG_ABS  ; x=-abs(x)
  orps y,K_NEG_ABS  ; y=-abs(y)

;Translate
  movaps a,c
  mulps a,K_0_25 ; a=K*c
  addps x,c    ; x+=c
  addps y,a    ; y+=K*c

  subps c,a    ; c*=1-K: scale translation

;Squared distance to [0,0,0]
  movaps a,x
  movaps b,y
  mulps a,a    ; a=x*x
  mulps b,b    ; b=y*y
  addps b,a    ; b=x*x+y*y
  movaps a,z
  mulps a,a    ; a=z*z
  addps b,a    ; b=length^2=x*x+y*y+z*z

;Orbit trap
  maxps o,b    ; o = max(o,length^2)

;Iterate 19 times
  dec ch
  jnz L        ; leaves ch=0, which the callers' "loop" instructions rely on

;Distance to a little sphere
  rsqrtps a,b  ; a=(length^2)^(-1/2)  (approximate reciprocal square root)
  mulps a,b    ; a=(length^2)^(-1/2 + 1) = length

  subps a,c
  subps a,c    ; a=length-2c: radius = 2*translation
  minps a,K_MAX_STEP ; a=min(length-2c, MAX_STEP)

  ret          ; bx=0xa060
